Cell lines data

Fig 1. Cell lines validation

seed = 65
set.seed(seed)
#print(as.matrix(colnames(big_df)))
col_id <- grep('[(]v[)]', colnames(big_df))
colnames(big_df) <- gsub('[(]v[)]', '', colnames(big_df))
smpl <- denoisingCTF::t_asinh(big_df)

# UMAP
# umap_smp <- umap::umap(smpl[,col_id])
# umap_smp <- as.data.frame(cbind(smpl,
#                                     UMAP1 = umap_smp$layout[,1],
#                                     UMAP2 = umap_smp$layout[,2]))
# save(umap_smp, file = "UMAPcelllines_paper2020.RData")
load("~/Documents/Massion_lab/manuscripts/prelim_cytof_adc/data/cell_lines/processed_files/mix/UMAPcelllines_paper2020.RData")

## TableGrob (4 x 3) "arrange": 12 grobs
##     z     cells    name           grob
## 1   1 (1-1,1-1) arrange gtable[layout]
## 2   2 (1-1,2-2) arrange gtable[layout]
## 3   3 (1-1,3-3) arrange gtable[layout]
## 4   4 (2-2,1-1) arrange gtable[layout]
## 5   5 (2-2,2-2) arrange gtable[layout]
## 6   6 (2-2,3-3) arrange gtable[layout]
## 7   7 (3-3,1-1) arrange gtable[layout]
## 8   8 (3-3,2-2) arrange gtable[layout]
## 9   9 (3-3,3-3) arrange gtable[layout]
## 10 10 (4-4,1-1) arrange gtable[layout]
## 11 11 (4-4,2-2) arrange gtable[layout]
## 12 12 (4-4,3-3) arrange gtable[layout]
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess

## Fig 2. Clustering cell lines

# find optimal number of clusters (elbow plot)
# DetermineNumberOfClusters(smpl[,col_id],k_max=45,plot=T,smooth=0.2, iter.max=50, seed = 45, 
#                           ask_ft = F,arcsn_tr = F)
# k-means clustering
set.seed(45)
cl_data <- stats::kmeans(smpl[,col_id], centers = 8, iter.max = 50)$cluster
df_cluster <- cbind(smpl[,col_id], 'cluster'= cl_data, "cl_ID"=smpl$cl_ID, umap_smp[c('UMAP1', 'UMAP2')])

# proyect clusters on UMAP
kl <- c("gray60", "#AAD7AC", "#F9C769", "steelblue1", "#B27CC1", "lightskyblue1", "#F2BFB9", "cornflowerblue")
ClusterUMAP_plot(df_cluster, color_by_cluster = T, color_by_protein = F, pallete = F,
                 cluster_col = 'cluster', plot_clusnames = T, plot_title = 'Cluster ID',
                 lg_names = levels(as.factor(df_cluster$cluster)),  
                 x_lim = c(-14,11), y_lim = c(-10,14), plot_colors = kl)

# barplot
prcnt_by_cl <- ClassAbundanceByPt(data=df_cluster, ptID_col = 'cluster', 
    class_col = 'cl_ID')
prcnt_by_cl <- prcnt_by_cl[order(row.names(prcnt_by_cl)),]
#rownames(prcnt_by_cl) <- paste0(rownames(prcnt_by_pt), '_', ref$CANARY)
prcnt_by_cl <- prcnt_by_cl*100
ct <- c("#EF7E33", "#52A02E", "#D7392E", "#9467BD", "#2977B4")
bp_order <- c(4,8,6,1,5,7,2,3)
dendrogram_barplot(prcnt_by_cl, dist_method = 'euclidean', 
                   hclust_method = 'complete', coul = ct, BPOrderAsDendrogram=F, 
                   bp_order=bp_order, xlabl = "Cell type (% of cluster)" , ylabl = "Cluster ID", 
                   cex_names = 1, cex_axis = 1.2,cex_lab = 0.5) #cex.sub and cex.lab do nothing

Supplementary Fig 1

load("~/Documents/Massion_lab/manuscripts/prelim_cytof_adc/data/cell_lines/processed_files/cell_lines_replicates_gated/cell_lines_val_labeled.RData")
#print(as.matrix(colnames(big_df)))
col_id <- grep('[(]v[)]', colnames(big_df_ctl))
#x <- c(15, 37, 41, 38, 26, 18, 45, 29, 46, 34, 44, 47)
colnames(big_df_ctl) <- gsub('[(]v[)]', '', colnames(big_df_ctl))
df <- denoisingCTF::t_asinh(big_df_ctl[,col_id])
colnames(df) <- gsub(" ","", sapply(strsplit(as.character(colnames(df)), "_"), "[[", 2))
df <- cbind(df, cl_ID=big_df_ctl$cl_ID, sample_ID=big_df_ctl$sample_ID)
df <- melt(df)
## Using cl_ID, sample_ID as id variables
# A549
ggplot(df[which(df$cl_ID=='A549'),], aes(x=value, group=sample_ID, color=sample_ID)) +
    geom_density(adjust=1.5) + facet_wrap( ~ variable, scales="free")

# H23
ggplot(df[which(df$cl_ID=='H23'),], aes(x=value, group=sample_ID, color=sample_ID)) +
    geom_density(adjust=1.5) + facet_wrap( ~ variable, scales="free")

# H3122
ggplot(df[which(df$cl_ID=='H3122'),], aes(x=value, group=sample_ID, color=sample_ID)) +
    geom_density(adjust=1.5) + facet_wrap( ~ variable, scales="free")

# Monocytes
ggplot(df[which(df$cl_ID=='Monocytes'),], aes(x=value, group=sample_ID, color=sample_ID)) +
    geom_density(adjust=1.5) + facet_wrap( ~ variable, scales="free")

# PC9
ggplot(df[which(df$cl_ID=='PC9'),], aes(x=value, group=sample_ID, color=sample_ID)) +
    geom_density(adjust=1.5) + facet_wrap( ~ variable, scales="free")

# Tc.cells
ggplot(df[which(df$cl_ID=='Tc.cells'),], aes(x=value, group=sample_ID, color=sample_ID)) +
    geom_density(adjust=1.5) + facet_wrap( ~ variable, scales="free")

# Th.cells
ggplot(df[which(df$cl_ID=='Th.cells'),], aes(x=value, group=sample_ID, color=sample_ID)) +
    geom_density(adjust=1.5) + facet_wrap( ~ variable, scales="free")

Patient samples

Fig 3

Fig 3A - Cell type proyection (UMAP)

# Changing Tc_cells and Th_cells
umap_smp$subtype <- as.character(umap_smp$subtype)
umap_smp$subtype4 <- as.character(umap_smp$subtype4)
k <- which(umap_smp$subtype =='Tc_cells')
umap_smp[k, 'subtype'] <- 'CD8T_cells'
umap_smp[k, 'subtype4'] <- 'CD8T_cells'
k <- which(umap_smp$subtype =='Th_cells')
umap_smp[k, 'subtype'] <- 'CD4T_cells'
umap_smp[k, 'subtype4'] <- 'CD4T_cells'
umap_smp$subtype <- as.factor(umap_smp$subtype)
umap_smp$subtype4 <- as.factor(umap_smp$subtype4)
umap_smp$subtype4 <- factor(umap_smp$subtype4, levels = c("Epithelial", "Endothelial", "Mesenchymal", "Other_immune", "Myeloid", "CD8T_cells", "CD4T_cells"))


umap_smp['subtype4'] <- as.character(umap_smp$subtype)
umap_smp[which(umap_smp$cell_type=='Epithelial'), 'subtype4'] <- 'ECC'
umap_smp$subtype4 <- as.factor(umap_smp$subtype4)
umap_smp$subtype4 <- factor(umap_smp$subtype4, levels = c("ECC", "Endothelial", "Mesenchymal", "Other_immune", "Myeloid", "CD8T_cells", "CD4T_cells"))

Fig 3B - Protein markers in UMAP proyection

## TableGrob (2 x 6) "arrange": 12 grobs
##     z     cells    name           grob
## 1   1 (1-1,1-1) arrange gtable[layout]
## 2   2 (1-1,2-2) arrange gtable[layout]
## 3   3 (1-1,3-3) arrange gtable[layout]
## 4   4 (1-1,4-4) arrange gtable[layout]
## 5   5 (1-1,5-5) arrange gtable[layout]
## 6   6 (1-1,6-6) arrange gtable[layout]
## 7   7 (2-2,1-1) arrange gtable[layout]
## 8   8 (2-2,2-2) arrange gtable[layout]
## 9   9 (2-2,3-3) arrange gtable[layout]
## 10 10 (2-2,4-4) arrange gtable[layout]
## 11 11 (2-2,5-5) arrange gtable[layout]
## 12 12 (2-2,6-6) arrange gtable[layout]

Fig 3C - Barplot cell types + dendrogram

## Bootstrap (r = 0.5)... Done.
## Bootstrap (r = 0.6)... Done.
## Bootstrap (r = 0.7)... Done.
## Bootstrap (r = 0.8)... Done.
## Bootstrap (r = 0.9)... Done.
## Bootstrap (r = 1.0)... Done.
## Bootstrap (r = 1.1)... Done.
## Bootstrap (r = 1.2)... Done.
## Bootstrap (r = 1.3)... Done.
## Bootstrap (r = 1.4)... Done.

Fig 3D - Correlation plot Immune cells vs Endothelial cells

## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'

Supplementary info

S. Figure on cell type abundances G vs P

## 
## Attaching package: 'operators'
## The following objects are masked from 'package:base':
## 
##     options, strrep
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:operators':
## 
##     %>%
## The following object is masked from 'package:gridExtra':
## 
##     combine
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
##      cell_type   p.value FDR_corrected
## 1      Myeloid 0.3523810     0.6166667
## 2 Other_immune 0.7619048     0.8888889
## 3   CD4T_cells 0.3523810     0.6166667
## 4   CD8T_cells 0.1714286     0.6000000
## 5   Epithelial 0.9142857     0.9142857
## 6  Mesenchymal 0.4761905     0.6666667
## 7  Endothelial 0.1714286     0.6000000
## Using pt_ID, CANARY as id variables
## `stat_bindot()` using `bins = 30`. Pick better value with `binwidth`.

S. Figure on protein expression by cell type G vs P

## Warning in wilcox.test.default(exp_median[ig1, i], exp_median[ig2, i]): cannot
## compute exact p-value with ties

## Warning in wilcox.test.default(exp_median[ig1, i], exp_median[ig2, i]): cannot
## compute exact p-value with ties

## Warning in wilcox.test.default(exp_median[ig1, i], exp_median[ig2, i]): cannot
## compute exact p-value with ties

## Warning in wilcox.test.default(exp_median[ig1, i], exp_median[ig2, i]): cannot
## compute exact p-value with ties

## Warning in wilcox.test.default(exp_median[ig1, i], exp_median[ig2, i]): cannot
## compute exact p-value with ties

## Warning in wilcox.test.default(exp_median[ig1, i], exp_median[ig2, i]): cannot
## compute exact p-value with ties

## Warning in wilcox.test.default(exp_median[ig1, i], exp_median[ig2, i]): cannot
## compute exact p-value with ties

## Warning in wilcox.test.default(exp_median[ig1, i], exp_median[ig2, i]): cannot
## compute exact p-value with ties

## Warning in wilcox.test.default(exp_median[ig1, i], exp_median[ig2, i]): cannot
## compute exact p-value with ties

## Warning in wilcox.test.default(exp_median[ig1, i], exp_median[ig2, i]): cannot
## compute exact p-value with ties

## Warning in wilcox.test.default(exp_median[ig1, i], exp_median[ig2, i]): cannot
## compute exact p-value with ties

## Warning in wilcox.test.default(exp_median[ig1, i], exp_median[ig2, i]): cannot
## compute exact p-value with ties

## Warning in wilcox.test.default(exp_median[ig1, i], exp_median[ig2, i]): cannot
## compute exact p-value with ties

## Warning in wilcox.test.default(exp_median[ig1, i], exp_median[ig2, i]): cannot
## compute exact p-value with ties

## Warning in wilcox.test.default(exp_median[ig1, i], exp_median[ig2, i]): cannot
## compute exact p-value with ties

## Warning in wilcox.test.default(exp_median[ig1, i], exp_median[ig2, i]): cannot
## compute exact p-value with ties

## Warning in wilcox.test.default(exp_median[ig1, i], exp_median[ig2, i]): cannot
## compute exact p-value with ties

All cells

## Using CANARY, pt_ID as id variables
## `stat_bindot()` using `bins = 30`. Pick better value with `binwidth`.
## Warning in wilcox.test.default(c(2.63407697361086, 0.0695405059165581,
## 2.61931335471328, : cannot compute exact p-value with ties
## Warning in wilcox.test.default(c(0.347064923349515, 0.756454263937334,
## 0.461532710617599, : cannot compute exact p-value with ties
## Warning in wilcox.test.default(c(6.85846430892733e-05, 0.176901427304811, :
## cannot compute exact p-value with ties
## Warning in wilcox.test.default(c(0.604787413934418, 2.00996443199906,
## 0.22811706521791, : cannot compute exact p-value with ties

##              Protein    p.value FDR_corrected
## 1        141Pr_EpCAM 0.76190476     1.0000000
## 2       142Nd_ccasp3 1.00000000     1.0000000
## 3      144Nd_HLA-ABC 0.06910652     0.6095238
## 4        147Sm_b-CAT 0.10550173     0.6095238
## 5         151Eu_TTF1 0.22976627     0.9190651
## 6         154Sm_CD45 0.60952381     1.0000000
## 7         155Gd_CD56 1.00000000     1.0000000
## 8     156Gd_Vimentin 0.91428571     1.0000000
## 9      158Gd_p-STAT3 0.91428571     1.0000000
## 10        160Gd_MDM2 0.91428571     1.0000000
## 11 161Dy_Cytokeratin 0.91428571     1.0000000
## 12        163Dy_TP63 0.60952381     1.0000000
## 13         164Dy_CK7 1.00000000     1.0000000
## 14        166Er_CD44 0.47619048     1.0000000
## 15         170Yb_CD3 0.76190476     1.0000000
## 16      174Yb_HLA-DR 0.11428571     0.6095238

Endothelial cells

## Using CANARY, pt_ID as id variables
## `stat_bindot()` using `bins = 30`. Pick better value with `binwidth`.

##              Protein   p.value FDR_corrected
## 1      144Nd_HLA-ABC 0.1714286     0.7619048
## 2         145Nd_CD31 0.4761905     0.7619048
## 3  146Nd_Thioredoxin 0.3523810     0.7619048
## 4        147Sm_b-CAT 0.4761905     0.7619048
## 5     156Gd_Vimentin 1.0000000     1.0000000
## 6      158Gd_p-STAT3 0.7619048     0.8465608
## 7         160Gd_MDM2 0.6095238     0.7619048
## 8  161Dy_Cytokeratin 0.4761905     0.7619048
## 9         163Dy_TP63 0.6095238     0.7619048
## 10      174Yb_HLA-DR 0.1714286     0.7619048

Fibroblasts/Mesenchymal cells

## Using CANARY, pt_ID as id variables
## `stat_bindot()` using `bins = 30`. Pick better value with `binwidth`.
## Warning in wilcox.test.default(c(0, 0.0958118253303701, 0.284162729407862, :
## cannot compute exact p-value with ties
## Warning in wilcox.test.default(c(0.802649816095183, 0.215492384630114,
## 0.0605647943338007, : cannot compute exact p-value with ties

##          Protein    p.value FDR_corrected
## 1    141Pr_EpCAM 0.76190476     0.9142857
## 2   142Nd_ccasp3 0.91428571     0.9142857
## 3     155Gd_CD56 0.47619048     0.8571429
## 4 156Gd_Vimentin 0.91428571     0.9142857
## 5  158Gd_p-STAT3 0.11428571     0.5142857
## 6     160Gd_MDM2 0.19945761     0.5983728
## 7     163Dy_TP63 0.47619048     0.8571429
## 8      170Yb_CD3 0.91406196     0.9142857
## 9   174Yb_HLA-DR 0.03809524     0.3428571

Immune cells

## Using CANARY, pt_ID as id variables
## `stat_bindot()` using `bins = 30`. Pick better value with `binwidth`.
## Warning in wilcox.test.default(c(3.06708656196431, 0.066910551904235,
## 3.04431780411568, : cannot compute exact p-value with ties
## Warning in wilcox.test.default(c(0.202586121632867, 0.0645635010399363, : cannot
## compute exact p-value with ties
## Warning in wilcox.test.default(c(0.501727573634015, 0.130067040154415,
## 0.712432219756351, : cannot compute exact p-value with ties

##           Protein   p.value FDR_corrected
## 1   144Nd_HLA-ABC 0.1055017     0.8571429
## 2      154Sm_CD45 0.4761905     0.9523810
## 3  156Gd_Vimentin 0.4761905     0.9523810
## 4   158Gd_p-STAT3 0.7619048     0.9523810
## 5       159Tb_CD4 0.9140620     1.0000000
## 6      163Dy_TP63 0.6095238     0.9523810
## 7      166Er_CD44 1.0000000     1.0000000
## 8       170Yb_CD3 0.7619048     0.9523810
## 9     171Yb_CD11b 0.3314247     0.9523810
## 10   174Yb_HLA-DR 0.1714286     0.8571429

CD8 T cells

## Using CANARY, pt_ID as id variables
## `stat_bindot()` using `bins = 30`. Pick better value with `binwidth`.

##          Protein    p.value FDR_corrected
## 1  144Nd_HLA-ABC 0.03174603     0.2539683
## 2     154Sm_CD45 0.90476190     0.9047619
## 3 156Gd_Vimentin 0.55555556     0.8344671
## 4     163Dy_TP63 0.19047619     0.5079365
## 5     166Er_CD44 0.73015873     0.8344671
## 6      168Er_CD8 0.55555556     0.8344671
## 7      170Yb_CD3 0.73015873     0.8344671
## 8   174Yb_HLA-DR 0.11111111     0.4444444

CD4 T cells

## Using CANARY, pt_ID as id variables
## `stat_bindot()` using `bins = 30`. Pick better value with `binwidth`.
## Warning in wilcox.test.default(c(0, 0.0561621657336121, 0, 0), c(0, 0,
## 1.50456532271426, : cannot compute exact p-value with ties

##          Protein   p.value FDR_corrected
## 1  144Nd_HLA-ABC 0.6095238     0.9142857
## 2     154Sm_CD45 0.3523810     0.7928571
## 3 156Gd_Vimentin 0.2571429     0.7714286
## 4      159Tb_CD4 0.6095238     0.9142857
## 5     163Dy_TP63 0.7619048     0.9795918
## 6     166Er_CD44 0.1714286     0.7714286
## 7     169Tm_CD24 1.0000000     1.0000000
## 8      170Yb_CD3 0.2571429     0.7714286
## 9   174Yb_HLA-DR 0.9142857     1.0000000

Myeloid cells

## Using CANARY, pt_ID as id variables
## `stat_bindot()` using `bins = 30`. Pick better value with `binwidth`.
## Warning in wilcox.test.default(c(3.30075534359615, 0.0972935332589382,
## 3.31354462816258, : cannot compute exact p-value with ties
## Warning in wilcox.test.default(c(1.62386202915046, 0.717296683223608,
## 2.44717234480325, : cannot compute exact p-value with ties
## Warning in wilcox.test.default(c(0.748820669157824, 0.123253874437529,
## 1.67192224879611, : cannot compute exact p-value with ties
## Warning in wilcox.test.default(c(0.63662474810693, 0.0913139040611759,
## 1.20841095112529, : cannot compute exact p-value with ties

##              Protein   p.value FDR_corrected
## 1      144Nd_HLA-ABC 0.1645218     0.9523810
## 2         145Nd_CD31 0.4541743     0.9523810
## 3  146Nd_Thioredoxin 0.5929097     0.9846154
## 4         154Sm_CD45 0.4761905     0.9523810
## 5     156Gd_Vimentin 0.4761905     0.9523810
## 6      158Gd_p-STAT3 0.3523810     0.9523810
## 7          159Tb_CD4 0.7461278     0.9846154
## 8         160Gd_MDM2 0.9142857     0.9846154
## 9         163Dy_TP63 1.0000000     1.0000000
## 10        166Er_CD44 0.9142857     0.9846154
## 11       171Yb_CD11b 0.4761905     0.9523810
## 12        172Yb_p-S6 0.9142857     0.9846154
## 13      174Yb_HLA-DR 0.2571429     0.9523810
## 14       175Lu_PD-L1 0.7619048     0.9846154

Epithelial cells

## Using CANARY, pt_ID as id variables
## `stat_bindot()` using `bins = 30`. Pick better value with `binwidth`.
## Warning in wilcox.test.default(c(2.33079293987923, 0.0582644008491126,
## 1.75087785796523, : cannot compute exact p-value with ties
## Warning in wilcox.test.default(c(1.63156525887895, 0.322305842850225,
## 1.32203451663395, : cannot compute exact p-value with ties
## Warning in wilcox.test.default(c(1.13653172069154, 0.0527562183492093,
## 1.50292979733778, : cannot compute exact p-value with ties

##              Protein    p.value FDR_corrected
## 1        141Pr_EpCAM 0.11428571     0.4114286
## 2       142Nd_ccasp3 0.35238095     0.5714286
## 3      144Nd_HLA-ABC 0.16452182     0.4408163
## 4        147Sm_b-CAT 0.17142857     0.4408163
## 5         148Nd_HER2 0.03809524     0.4114286
## 6         151Eu_TTF1 0.09898793     0.4114286
## 7         154Sm_CD45 0.47619048     0.5714286
## 8         155Gd_CD56 0.76190476     0.8067227
## 9     156Gd_Vimentin 0.47619048     0.5714286
## 10     158Gd_p-STAT3 0.35238095     0.5714286
## 11        160Gd_MDM2 0.91428571     0.9142857
## 12 161Dy_Cytokeratin 0.11428571     0.4114286
## 13         162Dy_MET 0.25714286     0.5142857
## 14        163Dy_TP63 0.47619048     0.5714286
## 15         164Dy_CK7 0.47619048     0.5714286
## 16        165Ho_EGFR 0.10550173     0.4114286
## 17         170Yb_CD3 0.60952381     0.6857143
## 18      174Yb_HLA-DR 0.25714286     0.5142857
ref_samples <- ref
# Check HLA-DR expression in controls
load("/Users/senosam/Documents/Massion_lab/CyTOF_summary/controls/annotated_controls.RData")
ref_clt <- ref[which(ref$CyTOF_date %in% ref_samples$CyTOF_date),]
annot_df_ctl <- annot_df_ctl[which(annot_df_ctl$CyTOF_date %in% ref_samples$CyTOF_date),]
ramos <- annot_df_ctl[which(annot_df_ctl$cell_line=='Ramos'),]
a549 <- annot_df_ctl[which(annot_df_ctl$cell_line=='A549'),]
ramos_median <- aggregate(denoisingCTF::t_asinh(ramos[,c(15, 17:31, 33:35, 37:48, 50:51)]), list(ramos[,'CyTOF_date']), median)
a549_median <- aggregate(denoisingCTF::t_asinh(a549[,c(15, 17:31, 33:35, 37:48, 50:51)]), list(a549[,'CyTOF_date']), median)
a549_median['Cell_line'] <- rep('A549', nrow(a549_median))
ramos_median['Cell_line'] <- rep('Ramos', nrow(ramos_median))
ctl_median <- rbind(a549_median, ramos_median)

ggplot(ctl_median, aes(x=Cell_line, y=`174Yb_HLA-DR`, fill = Cell_line)) +
  geom_dotplot(binaxis='y', stackdir='center', dotsize=0.8) +
  ylim(0,6)+
  #facet_wrap(~variable) +
  theme(plot.title = element_text(hjust = 0.5, size=22),
        axis.text.x = element_text(size = 14),
        axis.text.y = element_text(size = 14))
## `stat_bindot()` using `bins = 30`. Pick better value with `binwidth`.

summary(ctl_median$`174Yb_HLA-DR`[1:4])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.4604  0.7593  0.8666  0.8193  0.9267  1.0836
summary(ctl_median$`174Yb_HLA-DR`[5:8])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.186   3.244   4.329   3.726   4.812   5.059

Fig 4

Fig 4A - Epithelial clusters proyection (UMAP)

umap_smp['subtype'] <- gsub('Epithelial', 'ECCc' ,umap_smp$subtype)
umap_smp$subtype <- as.factor(umap_smp$subtype)
umap_smp$subtype <- factor(umap_smp$subtype, levels = c(paste0("ECCc_", 1:10)))
#umap_smp$subtype4 <- factor(umap_smp$subtype4, levels = c("ECC", "Endothelial", "Mesenchymal", "Other_immune", "Myeloid", "Tc_cells", "Th_cells"))
## Warning: Removed 28 rows containing non-finite values (stat_density2d).
## Warning: Removed 28 rows containing missing values (geom_point).

## Warning: Removed 28 rows containing missing values (geom_point).

## Warning: Removed 28 rows containing missing values (geom_point).

## Warning in if (density) {: the condition has length > 1 and only the first
## element will be used
## Warning: Removed 2288 rows containing non-finite values (stat_density2d).
## Warning: Removed 2288 rows containing missing values (geom_point).

Fig 4B - Barplot cell types + dendrogram

## Bootstrap (r = 0.5)... Done.
## Bootstrap (r = 0.6)... Done.
## Bootstrap (r = 0.7)... Done.
## Bootstrap (r = 0.8)... Done.
## Bootstrap (r = 0.9)... Done.
## Bootstrap (r = 1.0)... Done.
## Bootstrap (r = 1.1)... Done.
## Bootstrap (r = 1.2)... Done.
## Bootstrap (r = 1.3)... Done.
## Bootstrap (r = 1.4)... Done.

Fig 4C - Protein markers by cluster heatmap

Fig 4D - Correlation plot T cells vs ECCc

## Warning: Ignoring unknown parameters: label.size
## `geom_smooth()` using formula 'y ~ x'

## Warning: Ignoring unknown parameters: label.size
## `geom_smooth()` using formula 'y ~ x'

Supplementary info for reviewers

Validation with TCGA data

High hla-dr vs low hla-dr association with T cells (xCell deconvolution)

# xCell deconvolution TCGA data 
xcell <- read.delim("https://xcell.ucsf.edu/xCell_TCGA_RSEM.txt", row.names = 1)
cn_xc <- sapply(strsplit(colnames(xcell), ".01$"), "[[", 1)
cn_xc <- gsub('\\.', '_', cn_xc)
colnames(xcell) <- cn_xc
xcell <- xcell[,which(colnames(xcell) %in% tcga_adc_clinical$submitter_id)]
label_by_gene <- function(normalized_data, gene, meta_data, extremes_only=FALSE, cond_name){
    if(extremes_only){
        data_t <- data.frame(t(normalized_data))
        first_q <- quantile(data_t[,gene])[["25%"]]
        third_q <- quantile(data_t[,gene])[["75%"]]
        data_t$Condition <- "normal"
        data_t$Condition[data_t[,gene] < first_q] <- "low"
        data_t$Condition[data_t[,gene] > third_q] <- "high"

    } else {
        data_t <- data.frame(t(normalized_data))
        mid_q <- quantile(data_t[,gene])[["50%"]]
        data_t$Condition <- "low"
        data_t$Condition[data_t[,gene]  > mid_q] <- "high"

    }
    cn <- c(colnames(meta_data), cond_name)
    meta_data <- cbind(meta_data, x=data_t$Condition)
    meta_data$x <- as.factor(meta_data$x)
    colnames(meta_data) <- cn

    meta_data
}

# Labeling the data based on high vs low expression
hla_genes <- gsub('-', '.', rownames(dt_hla))
pdt <- tcga_adc_clinical
for(i in hla_genes){
  pdt <- label_by_gene(log2(dt_hla+1), gene=i, meta_data = pdt, extremes_only=TRUE, cond_name=i)
}
# Selecting cell types of interest
xcell_ct <- rownames(xcell)[c(6,7,11:13)]

# Subsetting a df with the xcell data to be used
x <- t(xcell)
x_clin <- pdt[rownames(x),]
x <- x[,xcell_ct]
x <- cbind(x, x_clin[,70:ncol(x_clin)])
x <- na.omit(x)
x <- reshape2::melt(x)
## Using HLA.DRA, HLA.DRB5, HLA.DRB6, HLA.DRB1, HLA.DQA1, HLA.DQB1, HLA.DQA2, HLA.DQB2, HLA.DOB, HLA.DMB, HLA.DMA, HLA.DOA, HLA.DPA1, HLA.DPB1, HLA.DPB2 as id variables
# Generating summary table
sum_df <- data.frame(matrix(nrow = 0, ncol = 5))
for (i in colnames(x)[grep("H",colnames(x))]){
  for (ii in unique(x$variable)) {
    k <- which(x$variable == ii)
    x_sub <- x[k,]
    high_i <- which(x_sub[,i] == 'high')
    low_i <- which(x_sub[,i] == 'low')
    high_med <- median(x_sub[high_i,'value'])
    low_med <- median(x_sub[low_i,'value'])
    pval <- wilcox.test(x_sub[high_i,'value'], x_sub[low_i,'value'])
    sum_df <- rbind(sum_df, c(i, ii, high_med, low_med, pval$p.value))
    #cat(high_med, low_med, pval$p.value)
  }
}
colnames(sum_df) <- c('Gene', 'Cell type', 'Median.High', 'Median.Low', 'p.value')
sum_df['p.adjusted'] <- p.adjust(sum_df$p.value, method = "BH")
#write.csv(sum_df, file = 'summary_xcellTCGA.csv')

Generating plots

violin_deconv <- function(x, ct, cluster_ID) {
  k <- which(x$variable %in% ct)
  x_sub <- x[k,c(cluster_ID, "variable", "value")]
  k2 <- which(!x_sub[,cluster_ID] == 'normal')
  x_sub <- x_sub[k2,]
  
  ggplot(x_sub, aes_string(x=cluster_ID, y='value', fill=cluster_ID)) + 
    geom_violin(trim=FALSE) +
    geom_boxplot(width=0.1, fill="white") +
    labs(title=NULL,x="Patient group", y = "Enrichment score") +
    scale_fill_brewer(palette="Dark2",name = gsub('\\.', '-', cluster_ID)) +
    ggsignif::geom_signif(comparisons = list(c("low", "high")),   
                map_signif_level=TRUE) +
    facet_wrap(~variable, scales='free')+
    theme_bw()+
    ggtitle(gsub('\\.', '-', cluster_ID))+
    theme(strip.text.x = element_text(size = 12), 
          axis.text.x =element_text(size = 10), 
          axis.text.y =element_text(size = 10),
          axis.title=element_text(size=12),
          plot.title = element_text(size=20, hjust = 0.5, face="italic"),
          legend.position = "none")
    

}
violin_deconv(x, ct=xcell_ct[c(1,4)], cluster_ID = "HLA.DRA")

violin_deconv(x, ct=xcell_ct[c(1,4)], cluster_ID = "HLA.DRB1")

Rectifying clinical data info (latest update)

S. Figure on ECC clusters abundances G vs P